We’re going to walk through an example workflow for the scABC package, walking through all the individual steps of the analysis. We’ll use the 6 cell line mixture for an example. The 1632 cells can be downloaded from GEO under accession number GSE65360 (https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE65360).

Pre-processing

We used Kundaje pipeline for aligning pair ended reads to hg19 and removing duplicates (https://github.com/kundajelab/atac_dnase_pipelines) with the command below for each read pair file.

bds_scr [SCR_NAME] atac.bds -align -species hg19 -species_file [SPECIES_FILE_PATH] -nth [NUM_THREADS] -fastq1_1 [READ_PAIR1] -fastq1_2 [READ_PAIR2]

For details on the pipeline, visit the Kundaje lab website or github page.

The resulting bam files were merged using samtools.

samtools merge [AGGREGATE_BAM] *.trim.PE2SE.nodup.bam

The merged bam file was then used as input into MACS2 to call merged peaks for later analysis, using the Kundaje pipeline again.

bds_scr [SCR_NAME] atac.bds -species hg19 -species_file [SPECIES_FILE_PATH] -nth [NUM_THREADS] -se -filt_bam [AGGREGATE_BAM]

Several of the cells have multiple runs. We merged these runs using samtools.

Obtaining and filtering counts

We will use scABC to analyse the single cell data. The cell level bam files and their samtools indexes are contained in the folder bams.

# always run
setwd("~/scRNAseqAnalysis/scATAC-RNAseq/scABC/vignettes/")
library(devtools)
devtools::install_github("timydaley/scABC", force = TRUE)
## Downloading GitHub repo timydaley/scABC@master
## from URL https://api.github.com/repos/timydaley/scABC/zipball/master
## Installing scABC
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file  \
##   --no-environ --no-save --no-restore --quiet CMD INSTALL  \
##   '/private/var/folders/8m/0f0r_lmn4p5bmrzymmp2d0qr0000gn/T/RtmpGMxBvQ/devtools3b4d41d71f03/timydaley-scABC-d2432f3'  \
##   --library='/Library/Frameworks/R.framework/Versions/3.4/Resources/library'  \
##   --install-tests
## 
library(scABC)
# table that has experiment info
bamfile.table = read.table(file = "NoTreatment/6Lines/SRX&Type&Batch.txt")
head(bamfile.table)
##          V1    V2                   V3
## 1 SRX859035 H1ESC singles-H1ESC-well-1
## 2 SRX859036 H1ESC singles-H1ESC-well-2
## 3 SRX859037 H1ESC singles-H1ESC-well-3
## 4 SRX859038 H1ESC singles-H1ESC-well-4
## 5 SRX859039 H1ESC singles-H1ESC-well-5
## 6 SRX859040 H1ESC singles-H1ESC-well-6
bamfiles = paste0("bams/", bamfile.table[,1], ".bam")
peaks = selectPeaks("NoTreatment/6Lines/mergeAll.tn5.pf.gappedPeak")
dim(peaks)
## [1] 242875     15

Computing the Forground and Background matrix takes quite a bit of time. We can do this once and cache the results for later.

#
ForeGround = getCountsMatrix(bamfiles, peaks);
#
ForeGroundFiltered = filterPeaks(ForeGround$ForeGroundMatrix, peaks)
peaks = ForeGroundFiltered$peaks
#
BackGround = getBackground(bamfiles, peaks)
ForeGroundBackGroundFiltered = filterSamples(ForeGround = ForeGroundFiltered$ForeGroundMatrix, BackGround = BackGround$BackGroundMatrix)
InSilicoForeGround = ForeGroundBackGroundFiltered$ForeGroundMatrix
InSilicoBackGround = ForeGroundBackGroundFiltered$BackGroundMatrix
InSilicoPeaks = peaks

Now let’s compute the landmarks and take a look at them.

#
InSilicoLandMarks = computeLandmarks(ForeGround = InSilicoForeGround, 
                                      BackGround = InSilicoBackGround, 
                                      nCluster = 6, lambda = 1, nTop = 2000)
# look at correlation between landmarks
cor(InSilicoLandMarks, InSilicoLandMarks, method = 'spearman')
##           [,1]      [,2]      [,3]      [,4]      [,5]      [,6]
## [1,] 1.0000000 0.5499138 0.4748952 0.4364145 0.4880086 0.4505189
## [2,] 0.5499138 1.0000000 0.5635555 0.4994078 0.6456894 0.4927121
## [3,] 0.4748952 0.5635555 1.0000000 0.4953581 0.5454351 0.4370819
## [4,] 0.4364145 0.4994078 0.4953581 1.0000000 0.4831819 0.4262173
## [5,] 0.4880086 0.6456894 0.5454351 0.4831819 1.0000000 0.4520639
## [6,] 0.4505189 0.4927121 0.4370819 0.4262173 0.4520639 1.0000000

It looks like the cluster landmarks are well seperated. Let’s look at the cell to cluster assignments.

# assign cells to the closest landmark #
InSilicoLandMarkAssignments = assign2landmarks(InSilicoForeGround, InSilicoLandMarks)
InSilicoCell2LandmarkCorrelation = cbind(apply(InSilicoForeGround, 2, function(x) cor(x, InSilicoLandMarks[,1], method = 'spearman')), 
                                         apply(InSilicoForeGround, 2, function(x) cor(x, InSilicoLandMarks[,2], method = 'spearman')), 
                                         apply(InSilicoForeGround, 2, function(x) cor(x, InSilicoLandMarks[,3], method = 'spearman')), 
                                         apply(InSilicoForeGround, 2, function(x) cor(x, InSilicoLandMarks[,4], method = 'spearman')), 
                                         apply(InSilicoForeGround, 2, function(x) cor(x, InSilicoLandMarks[,5], method = 'spearman')), 
                                         apply(InSilicoForeGround, 2, function(x) cor(x, InSilicoLandMarks[,6], method = 'spearman')))
cell.info = bamfile.table[which(paste0(bamfile.table[,1], ".bam") %in% colnames(InSilicoForeGround)), 2]
library(gplots) 
library(RColorBrewer)
library(devtools)
source_url("https://raw.githubusercontent.com/obigriffith/biostar-tutorials/master/Heatmaps/heatmap.3.R")
## SHA-1 hash of file is 015fc0457e61e3e93a903e69a24d96d2dac7b9fb
scalered <- colorRampPalette(c("white", "red"), space = "rgb")(256)
rcols1 = brewer.pal(6, "Accent")[1:6]
rowcols1 = rcols1[cell.info]
rcols2 = brewer.pal(6, "Dark2")[1:6]
rowcols2 = rcols2[InSilicoLandMarkAssignments]
rowcols = rbind(rowcols1, rowcols2)
rownames(rowcols) = c("rep info", "cluster")
heatmap.3(InSilicoCell2LandmarkCorrelation, dendrogram='none', Rowv=FALSE, Colv=FALSE,
          trace='none', col = scalered, margin = c(5, 5), density.info = "none", 
          RowSideColors = rowcols, RowSideColorsSize=2, symm=F,symkey=F,
          symbreaks=F, scale="none")
legend("bottomleft", legend = c(unique(cell.info), paste0("cluster ", 1:6)), col = c(rcols1, rcols2), border=FALSE, bty="n", y.intersp = 0.7, cex=0.7, pch = 15)

heatmap.2(InSilicoCell2LandmarkCorrelation, dendrogram='none', Rowv=FALSE, 
          Colv=FALSE, trace='none', col = scalered, margin = c(5, 5), 
          density.info = "none", RowSideColors = rowcols1, symm=F,
          symkey=F,symbreaks=F, scale="none")
legend("bottomleft", legend = c(unique(cell.info)), col = rcols1, border=FALSE, 
       bty="n", y.intersp = 0.7, cex=0.7, pch = 15)

To view the above figure better, we’re normalize the correlations of each cell so that the maximum is 1.

# for prettyness
for(i in 1:dim(InSilicoCell2LandmarkCorrelation)[1]){
  InSilicoCell2LandmarkCorrelation[i,] = InSilicoCell2LandmarkCorrelation[i,]/mean(InSilicoCell2LandmarkCorrelation[i,])
}
library(gplots) 
library(RColorBrewer)
heatmap.2(InSilicoCell2LandmarkCorrelation, dendrogram='none', Rowv=FALSE, Colv=FALSE, trace='none', col = scalered, margin = c(5, 5), density.info = "none", 
          RowSideColors = rowcols1, symm=F,symkey=F,symbreaks=F, scale="none")
legend("bottomleft", legend = c(unique(cell.info)), col = rcols1, border=FALSE, 
       bty="n", y.intersp = 0.7, cex=0.7, pch = 15)

The clustering correctly classifies all cells except for 1. This assumes we knew the number of clusters. What if we let scABC choose the number of clusters?

#
InSilicoBackGroundMedian = apply(InSilicoBackGround, 2, median)
InSilicoGapStat = getGapStat(InSilicoForeGround, InSilicoBackGroundMedian, 
                             nClusters=1:10, nPerm = 20, quiet = TRUE)
InSilicoGapStat$nClusterOptimal
## [1] 6
plotGapStat(InSilicoGapStat, nClusters = 1:10, main = "In Silico Gap Stat")

## [1] 1

scABC chooses 6 clusters and we can continue with this number of clusters.

#
InSilicoPeakSelection = getClusterSpecificPvalue(ForeGround=InSilicoForeGround, cluster_assignments = InSilicoLandMarkAssignments, background_medians = InSilicoBackGroundMedian)
## 
## Estimating beta MLE
## 
 0 % converged
 0 % converged
 0 % converged
 0 % converged
 0 % converged
 0 % converged
 2 % converged
 6 % converged
 10 % converged
 22 % converged
 32 % converged
 35 % converged
 35 % converged
 35 % converged
 35 % converged
 35 % converged
 35 % converged
 35 % converged
 35 % converged
 35 % converged
 35 % converged
 35 % converged
 35 % converged
 35 % converged
 35 % converged
 35 % converged
 35 % converged
 35 % converged
 35 % converged
 35 % converged
 35 % converged
 35 % converged
 35 % converged
 35 % converged
 35 % converged
 35 % converged
 35 % converged
 35 % converged
 35 % converged
 37 % converged
 49 % converged
 91 % converged
 99 % converged
 100 % converged
## Estimating beta MAP
## 
 0 % converged
 0 % converged
 0 % converged
 0 % converged
 0 % converged
 0 % converged
 0 % converged
 3 % converged
 7 % converged
 19 % converged
 77 % converged
 100 % converged
 100 % converged
 100 % converged
## Calculating the p-values
InSilicoPeakPvals = InSilicoPeakSelection$pvalue
head(InSilicoPeakPvals)
##         Cluster 1 Cluster 2    Cluster 3    Cluster 4 Cluster 5 Cluster 6
## [1,] 1.000000e+00 1.0000000 1.000000e+00 2.325067e-48 1.0000000 1.0000000
## [2,] 3.134483e-07 0.9999997 1.000000e+00 1.000000e+00 1.0000000 1.0000000
## [3,] 1.000000e+00 1.0000000 0.000000e+00 1.000000e+00 1.0000000 1.0000000
## [4,] 9.701213e-01 0.9621131 3.788687e-02 9.996608e-01 0.9948569 0.9974671
## [5,] 9.994836e-01 0.9870934 5.277733e-02 9.472227e-01 0.9999959 0.9975895
## [6,] 9.999988e-01 0.9999994 4.079405e-06 9.999992e-01 0.9999962 0.9999959

Let’s visualize the data.

# all peaks 
scalered <- colorRampPalette(c("white", "red"), space = "rgb")(256)
rcols1 = brewer.pal(6, "Accent")[1:6]
rowcols1 = rcols1[cell.info]
rcols2 = brewer.pal(6, "Dark2")[1:6]
rowcols2 = rcols2[InSilicoLandMarkAssignments]
rowcols = rbind(rowcols1, rowcols2)
rownames(rowcols) = c("rep info", "cluster")
x = t(InSilicoForeGround)
x = x[ ,head(order(rowSums(InSilicoForeGround), decreasing = TRUE), 10000)]
d = as.dist(1 - cor(x, method = "spearman"))
col.clus = hclust(d, method ="average")
#truncate to see patterns
x[which(x > 5)] = 5
heatmap.3(x, dendrogram='none', Rowv=FALSE, Colv=col.clus, trace='none', col = scalered, 
          margin = c(5, 5), density.info = "none", RowSideColors = rowcols, 
          RowSideColorsSize=2, main = "All peaks")
legend("bottomleft", legend = c(unique(cell.info), paste0("cluster ", 1:6)), 
       col = c(rcols1, rcols2), border=FALSE, bty="n", y.intersp = 0.7, cex=0.7, pch = 15)

We’ll select the cluster specific peaks by selecting the 10,000 peaks with the smallest p-values across all clusters.

# cluster specific peaks
ClusterSpecificPeaks = head(order(apply(InSilicoPeakPvals, 1, min), 
                                  decreasing = FALSE), 10000)
length(ClusterSpecificPeaks)
## [1] 10000
x = t(InSilicoForeGround[ClusterSpecificPeaks, ])
d = as.dist(1 - cor(x, method = "spearman"))
col.clus = hclust(d, method ="average")
#truncate to see patterns
x[which(x > 5)] = 5
heatmap.3(x, dendrogram='none', Rowv=FALSE, Colv=as.dendrogram(col.clus), trace='none', 
          col = scalered, margin = c(5, 5), density.info = "none", RowSideColors = rowcols,
          RowSideColorsSize=2, main = "Cluster specific peaks")
legend("bottomleft", legend = c(unique(cell.info), paste0("cluster ", 1:6)), 
       col = c(rcols1, rcols2), border=FALSE, bty="n", y.intersp = 0.7, cex=0.7, pch = 15)

Instead of all of the above commands to obtain the clustered and processed data, we can simply use the scABC command to do all of the above.

library(GenomicRanges)
library(Rsamtools)
library(WeightedCluster)
peakfile = "NoTreatment/6Lines/mergeAll.tn5.pf.gappedPeak"
bamfile.table = read.table(file = "NoTreatment/6Lines/SRX&Type&Batch.txt")
bamfiles = paste0("bams/", bamfile.table[,1], ".bam")
# main function
InSilicoSCABC = scABC(bamfiles, peakfile)

We’ll subset a selection of peaks to look at top 10,000 cluster specific peaks.

#
ClusterSpecificPeaksV2 = head(order(apply(InSilicoSCABC$PeakPVals, 1, min), 
                                    decreasing = FALSE), 10000)
length(intersect(ClusterSpecificPeaks, ClusterSpecificPeaksV2))
## [1] 10000
scalered <- colorRampPalette(c("white", "red"), space = "rgb")(256)
rcols1 = brewer.pal(6, "Accent")[1:6]
rowcols1 = rcols1[cell.info]
rcols2 = brewer.pal(6, "Dark2")[1:6]
rowcols2 = rcols2[InSilicoSCABC$cluster_assignments]
rowcols = rbind(rowcols1, rowcols2)
rownames(rowcols) = c("rep info", "cluster")
x = t(InSilicoSCABC$ForeGroundMatrix[ClusterSpecificPeaksV2, ])
d = as.dist(1 - cor(x, method = "spearman"))
col.clus = hclust(d, method ="average")
x[which(x > 5)] = 5
heatmap.3(x, dendrogram='none', Rowv=FALSE, Colv=as.dendrogram(col.clus), trace='none', 
          col = scalered, margin = c(5, 5), density.info = "none", RowSideColors = rowcols,
          RowSideColorsSize=2, main = "Cluster specific peaks")
legend("bottomleft", legend = c(unique(cell.info), paste0("cluster ", 1:6)), 
       col = c(rcols1, rcols2), border=FALSE, bty="n", y.intersp = 0.7, cex=0.7, pch = 15)

We can also select cluster specific peaks by taking peaks that are less than some threshhold. We have to set it to be really small because Rmarkdown does not support large matrices.

#
ClusterSpecificPeaksV3 = which(apply(InSilicoSCABC$PeakPVals, 1, min) < 0.0001)
length(ClusterSpecificPeaksV3)
## [1] 19009
# all peaks are cluster specific
scalered <- colorRampPalette(c("white", "red"), space = "rgb")(256)
rcols1 = brewer.pal(6, "Accent")[1:6]
rowcols1 = rcols1[cell.info]
rcols2 = brewer.pal(6, "Dark2")[1:6]
rowcols2 = rcols2[InSilicoSCABC$cluster_assignments]
rowcols = rbind(rowcols1, rowcols2)
rownames(rowcols) = c("rep info", "cluster")
x = t(InSilicoSCABC$ForeGroundMatrix[ClusterSpecificPeaksV3, ])
d = as.dist(1 - cor(x, method = "spearman"))
col.clus = hclust(d, method ="average")
x[which(x > 5)] = 5
heatmap.3(x, dendrogram='none', Rowv=FALSE, Colv=as.dendrogram(col.clus), 
          trace='none', col = scalered, margin = c(5, 5), density.info = "none", 
          RowSideColors = rowcols, RowSideColorsSize=2, main = "Cluster specific peaks")
legend("bottomleft", legend = c(unique(cell.info), paste0("cluster ", 1:6)), 
       col = c(rcols1, rcols2), border=FALSE, bty="n", y.intersp = 0.7, cex=0.7, pch = 15)

Using chromVAR on cluster specific peaks

To identify transcription factors active in cluster specific peaks we’ll use chromVAR (https://greenleaflab.github.io/chromVAR/) to compute TF motifs.

First we’ll look at the naive application of chromVAR without selecting for cluster specific peaks

#
ClusterSpecificPeaks = data.frame(chrom = InSilicoSCABC$peaks$chrom, start = InSilicoSCABC$peaks$start, end = InSilicoSCABC$peaks$end, cluster1pvals = InSilicoSCABC$PeakPVals[,1], cluster2pvals = InSilicoSCABC$PeakPVals[,2], cluster3pvals = InSilicoSCABC$PeakPVals[,3], cluster4pvals = InSilicoSCABC$PeakPVals[,4], cluster5pvals = InSilicoSCABC$PeakPVals[,5], cluster6pvals = InSilicoSCABC$PeakPVals[,6])
dim(ClusterSpecificPeaks)
## [1] 94934     9
write.table(ClusterSpecificPeaks, file = "InSilicoPeaksWithPvals.txt", sep = "\t", quote = FALSE, col.names = FALSE, row.names = FALSE)
#BiocInstaller::biocLite("GreenleafLab/chromVAR")
library(chromVAR)
library(motifmatchr)
library(Matrix)
library(SummarizedExperiment)
extra_cols = c(4, 5, 6, 7, 8, 9)
names(extra_cols) = c("cluster1", "cluster2", "cluster3", "cluster4", "cluster5", "cluster6")
peaks = getPeaks("~/scRNAseqAnalysis/scATAC-RNAseq/scABC/vignettes/InSilicoPeaksWithPvals.txt", extra_cols = extra_cols)
## Warning in getPeaks("~/scRNAseqAnalysis/scATAC-RNAseq/scABC/vignettes/InSilicoPeaksWithPvals.txt", : Peaks are not equal width!
##       Use resize(peaks, width = x, fix = "center") to make peaks equal in size,
##       where x is the desired size of the peaks)
## Warning in getPeaks("~/scRNAseqAnalysis/scATAC-RNAseq/scABC/vignettes/
## InSilicoPeaksWithPvals.txt", : Peaks not sorted
# peaks include broad peaks, remove broad peaks
hist(width(peaks), breaks = 200)
length(peaks)
## [1] 94934
peaks = peaks[which(width(peaks) <= 800)]
length(peaks)
## [1] 23566
peaks = resize(peaks, width = 500, fix = "center")
peaks = sort(peaks)
bamfiles = paste0("bams/", colnames(InSilicoSCABC$ForeGroundMatrix))
frag_counts = getCounts(bamfiles, peaks, paired = FALSE, 
                         colData = data.frame(cluster.assigment = factor(InSilicoSCABC$cluster_assignments)))
library(BSgenome.Hsapiens.UCSC.hg19)
frag_counts = addGCBias(frag_counts, genome = BSgenome.Hsapiens.UCSC.hg19)
motifs <- getJasparMotifs()
motifs.matched = matchMotifs(motifs, frag_counts, genome = BSgenome.Hsapiens.UCSC.hg19)
dev = computeDeviations(object = frag_counts, annotations = motifs.matched)
dev.scores = deviationScores(dev)
variability = computeVariability(dev)
plotVariability(variability, use_plotly = TRUE) 

variability[head(order(variability$variability, decreasing = TRUE), 20), ]
##                               name variability bootstrap_lower_bound
## MA0478.1_FOSL2               FOSL2    4.171021              3.824344
## MA0490.1_JUNB                 JUNB    4.093872              3.770376
## MA0477.1_FOSL1               FOSL1    4.079807              3.739060
## MA0491.1_JUND                 JUND    4.074336              3.749512
## MA0099.2_FOS::JUN         FOS::JUN    4.022065              3.675929
## MA0476.1_FOS                   FOS    3.975986              3.662666
## MA0489.1_JUN(var.2)     JUN(var.2)    3.955445              3.647968
## MA0841.1_NFE2                 NFE2    3.248245              2.996394
## MA0655.1_JDP2                 JDP2    3.155490              2.907860
## MA0462.1_BATF::JUN       BATF::JUN    3.085327              2.875677
## MA0080.4_SPI1                 SPI1    2.695883              2.535176
## MA0139.1_CTCF                 CTCF    2.602619              2.461270
## MA0140.2_GATA1::TAL1   GATA1::TAL1    2.421919              2.309954
## MA0653.1_IRF9                 IRF9    2.393844              2.281333
## MA0808.1_TEAD3               TEAD3    2.389406              2.221844
## MA0652.1_IRF8                 IRF8    2.356732              2.246050
## MA0090.2_TEAD1               TEAD1    2.171846              2.025194
## MA0517.1_STAT1::STAT2 STAT1::STAT2    2.165937              2.058199
## MA0809.1_TEAD4               TEAD4    2.134771              1.983942
## MA0501.1_MAF::NFE2       MAF::NFE2    2.038094              1.898431
##                       bootstrap_upper_bound p_value p_value_adj
## MA0478.1_FOSL2                     4.526154       0           0
## MA0490.1_JUNB                      4.446734       0           0
## MA0477.1_FOSL1                     4.427269       0           0
## MA0491.1_JUND                      4.430908       0           0
## MA0099.2_FOS::JUN                  4.368081       0           0
## MA0476.1_FOS                       4.306116       0           0
## MA0489.1_JUN(var.2)                4.292330       0           0
## MA0841.1_NFE2                      3.512678       0           0
## MA0655.1_JDP2                      3.419215       0           0
## MA0462.1_BATF::JUN                 3.323949       0           0
## MA0080.4_SPI1                      2.843321       0           0
## MA0139.1_CTCF                      2.752233       0           0
## MA0140.2_GATA1::TAL1               2.529015       0           0
## MA0653.1_IRF9                      2.506730       0           0
## MA0808.1_TEAD3                     2.563882       0           0
## MA0652.1_IRF8                      2.472861       0           0
## MA0090.2_TEAD1                     2.317526       0           0
## MA0517.1_STAT1::STAT2              2.267102       0           0
## MA0809.1_TEAD4                     2.280900       0           0
## MA0501.1_MAF::NFE2                 2.183571       0           0
tsne = deviationsTsne(dev, threshold = 1, perplexity = 10)
tsne_plot = plotDeviationsTsne(dev, tsne, annotation = "FOSL2", 
                                 sample_column = "cluster.assigment", shiny = FALSE)
tsne_plot[[1]]

tsne_plot[[2]]

tsne_plot = plotDeviationsTsne(dev, tsne, annotation = "CTCF", 
                                 sample_column = "cluster.assigment", shiny = FALSE)
tsne_plot[[2]]

tsne_plot = plotDeviationsTsne(dev, tsne, annotation = "GATA1::TAL1", 
                                 sample_column = "cluster.assigment", shiny = FALSE)
tsne_plot[[2]]

top_motifs = variability$name[head(order(variability$variability, decreasing = TRUE), 20)]
names(top_motifs) = rownames(variability[head(order(variability$variability, decreasing = TRUE), 20), ])
top_devs = dev.scores[which(rownames(dev.scores) %in% names(top_motifs)), ]
rownames(top_devs) = top_motifs[match(rownames(top_devs), names(top_motifs))]
library(gplots) 
library(RColorBrewer)
scalebluered <- colorRampPalette(c("blue", "white", "red"), space = "rgb")(256)
cols = brewer.pal(6, "Dark2")[1:6]
cols = cols[c(6, 1, 4, 2, 5, 3)] # reorder to agree with Zhi's figure
rowcols = cols[InSilicoSCABC$cluster_assignments]
library(vegan)
d = vegdist(top_devs, method = "euclidean")
col.clus = hclust(d, "centroid")
heatmap.2(t(top_devs), dendrogram='none', Rowv=FALSE, Colv=as.dendrogram(col.clus), trace='none', col = scalebluered, density.info = "none", RowSideColors = rowcols, margin = c(10, 1))

Most of the TFs are BJ specific. Let’s look at when we take cluster specific peaks, but an equal number of each cluster.

topPeaks = peaks[which(apply(elementMetadata(peaks), 1, min) < 1e-5)]
fragCountsTopPeaks = getCounts(bamfiles, topPeaks, paired = FALSE, 
                         colData = data.frame(cluster.assigment = factor(InSilicoSCABC$cluster_assignments)))
fragCountsTopPeaks = addGCBias(fragCountsTopPeaks, genome = BSgenome.Hsapiens.UCSC.hg19)
motifs <- getJasparMotifs()
motifsTopPeaks.matched = matchMotifs(motifs, fragCountsTopPeaks, genome = BSgenome.Hsapiens.UCSC.hg19)
devTopPeaks = computeDeviations(object = fragCountsTopPeaks, annotations = motifsTopPeaks.matched)
devTopPeaks.scores = deviationScores(devTopPeaks)
variabilityTopPeaks = computeVariability(devTopPeaks)
plotVariability(variabilityTopPeaks, use_plotly = TRUE) 
variabilityTopPeaks[head(order(variabilityTopPeaks$variability, decreasing = TRUE), 20), ]
##                               name variability bootstrap_lower_bound
## MA0478.1_FOSL2               FOSL2    2.268453              2.094568
## MA0490.1_JUNB                 JUNB    2.219077              2.050912
## MA0099.2_FOS::JUN         FOS::JUN    2.217914              2.033092
## MA0477.1_FOSL1               FOSL1    2.201068              2.039151
## MA0491.1_JUND                 JUND    2.194246              2.023153
## MA0476.1_FOS                   FOS    2.139884              1.979456
## MA0489.1_JUN(var.2)     JUN(var.2)    2.089581              1.919563
## MA0652.1_IRF8                 IRF8    1.834726              1.742883
## MA0140.2_GATA1::TAL1   GATA1::TAL1    1.820301              1.736505
## MA0517.1_STAT1::STAT2 STAT1::STAT2    1.771074              1.690564
## MA0841.1_NFE2                 NFE2    1.757561              1.621410
## MA0653.1_IRF9                 IRF9    1.718928              1.646433
## MA0655.1_JDP2                 JDP2    1.708895              1.571240
## MA0830.1_TCF4                 TCF4    1.681157              1.601761
## MA0522.2_TCF3                 TCF3    1.652910              1.570793
## MA0462.1_BATF::JUN       BATF::JUN    1.650556              1.530218
## MA0778.1_NFKB2               NFKB2    1.613152              1.529105
## MA0050.2_IRF1                 IRF1    1.581729              1.503034
## MA0080.4_SPI1                 SPI1    1.569772              1.485416
## MA0808.1_TEAD3               TEAD3    1.550953              1.455613
##                       bootstrap_upper_bound       p_value   p_value_adj
## MA0478.1_FOSL2                     2.426135  0.000000e+00  0.000000e+00
## MA0490.1_JUNB                      2.370993  0.000000e+00  0.000000e+00
## MA0099.2_FOS::JUN                  2.375615  0.000000e+00  0.000000e+00
## MA0477.1_FOSL1                     2.347924  0.000000e+00  0.000000e+00
## MA0491.1_JUND                      2.350667  0.000000e+00  0.000000e+00
## MA0476.1_FOS                       2.280780  0.000000e+00  0.000000e+00
## MA0489.1_JUN(var.2)                2.236815  0.000000e+00  0.000000e+00
## MA0652.1_IRF8                      1.917120 2.491118e-244 1.201964e-242
## MA0140.2_GATA1::TAL1               1.903609 1.399384e-236 6.001801e-235
## MA0517.1_STAT1::STAT2              1.844010 5.457580e-211 2.106626e-209
## MA0841.1_NFE2                      1.880238 3.379196e-204 1.185791e-202
## MA0653.1_IRF9                      1.792765 2.436011e-185 7.835837e-184
## MA0655.1_JDP2                      1.833406 1.403980e-180 4.168742e-179
## MA0830.1_TCF4                      1.760106 1.035880e-167 2.856069e-166
## MA0522.2_TCF3                      1.724996 4.690590e-155 1.207045e-153
## MA0462.1_BATF::JUN                 1.759297 5.072664e-154 1.223780e-152
## MA0778.1_NFKB2                     1.694941 5.141825e-138 1.167497e-136
## MA0050.2_IRF1                      1.656781 3.405758e-125 7.303459e-124
## MA0080.4_SPI1                      1.649027 1.815860e-120 3.689063e-119
## MA0808.1_TEAD3                     1.639982 3.378729e-113 6.520947e-112
intersect(variabilityTopPeaks$name[head(order(variabilityTopPeaks$variability, decreasing = TRUE), 20)], variability$name[head(order(variability$variability, decreasing = TRUE), 20)])
##  [1] "FOSL2"        "JUNB"         "FOS::JUN"     "FOSL1"       
##  [5] "JUND"         "FOS"          "JUN(var.2)"   "IRF8"        
##  [9] "GATA1::TAL1"  "STAT1::STAT2" "NFE2"         "IRF9"        
## [13] "JDP2"         "BATF::JUN"    "SPI1"         "TEAD3"
tsneTopPeaks = deviationsTsne(devTopPeaks, threshold = 1, perplexity = 10)
tsneTopPeaks_plot = plotDeviationsTsne(devTopPeaks, tsneTopPeaks, annotation = "FOSL2", 
                                 sample_column = "cluster.assigment", shiny = FALSE)
tsne_plot[[1]]

A large number intersect, which is good.

topMotifsTopPeaks = variabilityTopPeaks$name[head(order(variabilityTopPeaks$variability, decreasing = TRUE), 20)]
names(topMotifsTopPeaks) = rownames(variabilityTopPeaks[head(order(variabilityTopPeaks$variability, decreasing = TRUE), 20), ])
topDevsTopPeaks = devTopPeaks.scores[which(rownames(devTopPeaks.scores) %in% names(topMotifsTopPeaks)), ]
rownames(topDevsTopPeaks) = topMotifsTopPeaks[match(rownames(topDevsTopPeaks), names(topMotifsTopPeaks))]
topDevsTopPeaks[which(is.na(topDevsTopPeaks))] = 0
library(gplots) 
library(RColorBrewer)
scalebluered <- colorRampPalette(c("blue", "white", "red"), space = "rgb")(256)
cols = brewer.pal(6, "Dark2")[1:6]
cols = cols[c(6, 1, 4, 2, 5, 3)] # reorder to agree with Zhi's figure
rowcols = cols[InSilicoSCABC$cluster_assignments]
library(vegan)
d = vegdist(topDevsTopPeaks, method = "euclidean")
col.clus = hclust(d, "centroid")
heatmap.2(t(topDevsTopPeaks), dendrogram='none', Rowv=FALSE, Colv=as.dendrogram(col.clus), trace='none', col = scalebluered, density.info = "none", RowSideColors = rowcols, margin = c(10, 1))

In the cluster specific peaks the Jun family motifs in BJ are still present, but now we see the Gata1-Tal1 motif in K562 cells (see https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3431495/ or http://mcb.asm.org/content/25/4/1215.full) and Pou motifs in H1 cells (espcially Pou3f2, aka Oct4).

Session Info

sessionInfo()
## R version 3.4.1 (2017-06-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: OS X El Capitan 10.11.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] vegan_2.4-4                       lattice_0.20-35                  
##  [3] permute_0.9-4                     BSgenome.Hsapiens.UCSC.hg19_1.4.0
##  [5] BSgenome_1.44.1                   rtracklayer_1.36.4               
##  [7] SummarizedExperiment_1.6.3        DelayedArray_0.2.7               
##  [9] matrixStats_0.52.2                Biobase_2.36.2                   
## [11] Matrix_1.2-11                     motifmatchr_0.99.8               
## [13] chromVAR_0.5.2                    RColorBrewer_1.1-2               
## [15] gplots_3.0.1                      WeightedCluster_1.2-1            
## [17] cluster_2.0.6                     TraMineR_2.0-7                   
## [19] Rsamtools_1.28.0                  Biostrings_2.44.2                
## [21] XVector_0.16.0                    GenomicRanges_1.28.5             
## [23] GenomeInfoDb_1.12.2               IRanges_2.10.3                   
## [25] S4Vectors_0.14.4                  BiocGenerics_0.22.0              
## [27] scABC_0.99.0                      devtools_1.13.3                  
## 
## loaded via a namespace (and not attached):
##   [1] Rtsne_0.13                  VGAM_1.0-4                 
##   [3] colorspace_1.3-2            rprojroot_1.2              
##   [5] htmlTable_1.9               base64enc_0.1-3            
##   [7] DT_0.2                      bit64_0.9-7                
##   [9] AnnotationDbi_1.38.2        codetools_0.2-15           
##  [11] splines_3.4.1               R.methodsS3_1.7.1          
##  [13] knitr_1.17                  jsonlite_1.5               
##  [15] Formula_1.2-2               seqLogo_1.42.0             
##  [17] annotate_1.54.0             GO.db_3.4.1                
##  [19] png_0.1-7                   R.oo_1.21.0                
##  [21] shiny_1.0.5                 readr_1.1.1                
##  [23] compiler_3.4.1              httr_1.3.1                 
##  [25] backports_1.1.0             assertthat_0.2.0           
##  [27] lazyeval_0.2.0              acepack_1.4.1              
##  [29] htmltools_0.3.6             tools_3.4.1                
##  [31] bindrcpp_0.2                glue_1.1.1                 
##  [33] gtable_0.2.0                TFMPvalue_0.0.6            
##  [35] GenomeInfoDbData_0.99.0     reshape2_1.4.2             
##  [37] dplyr_0.7.3                 Rcpp_0.12.12               
##  [39] nlme_3.1-131                gdata_2.18.0               
##  [41] crosstalk_1.0.0             CNEr_1.12.1                
##  [43] stringr_1.2.0               mime_0.5                   
##  [45] miniUI_0.1.1                poweRlaw_0.70.1            
##  [47] gtools_3.5.0                XML_3.98-1.9               
##  [49] JASPAR2016_1.4.0            MASS_7.3-47                
##  [51] zlibbioc_1.22.0             scales_0.5.0               
##  [53] hms_0.3                     yaml_2.1.14                
##  [55] curl_2.8.1                  memoise_1.1.0              
##  [57] gridExtra_2.3               ggplot2_2.2.1              
##  [59] rpart_4.1-11                latticeExtra_0.6-28        
##  [61] stringi_1.1.5               RSQLite_2.0                
##  [63] checkmate_1.8.3             caTools_1.17.1             
##  [65] boot_1.3-20                 BiocParallel_1.10.1        
##  [67] rlang_0.1.2                 pkgconfig_2.0.1            
##  [69] bitops_1.0-6                evaluate_0.10.1            
##  [71] purrr_0.2.3                 bindr_0.1                  
##  [73] labeling_0.3                GenomicAlignments_1.12.2   
##  [75] htmlwidgets_0.9             bit_1.1-12                 
##  [77] plyr_1.8.4                  magrittr_1.5               
##  [79] R6_2.2.2                    Hmisc_4.0-3                
##  [81] DBI_0.7                     mgcv_1.8-20                
##  [83] foreign_0.8-69              withr_2.0.0                
##  [85] survival_2.41-3             KEGGREST_1.16.1            
##  [87] RCurl_1.95-4.8              nnet_7.3-12                
##  [89] tibble_1.3.4                KernSmooth_2.23-15         
##  [91] plotly_4.7.1                nabor_0.4.7                
##  [93] rmarkdown_1.6               TFBSTools_1.14.2           
##  [95] grid_3.4.1                  data.table_1.10.4          
##  [97] blob_1.1.0                  git2r_0.19.0               
##  [99] digest_0.6.12               xtable_1.8-2               
## [101] tidyr_0.7.1                 httpuv_1.3.5               
## [103] R.utils_2.5.0               munsell_0.4.3              
## [105] DirichletMultinomial_1.18.0 viridisLite_0.2.0